# Libraries used in this analysis 

# I. universal data wrangling and visualization
library(tidyverse)      # ggplot, dplyr, purrr, forcats, stringr, etc. See https://tidyverse.tidyverse.org/
library(tidyselect)     # useful for subsetting dataframes 
library(gtools)

# II. formatting, plotting
library(scales)         # used in formatting relative abundance % labels; dollar_format()
library(knitr)          # necessary for kable, for displaying nicer tables in html/pdf output
library(ggplot2)        # customizable data visualization
library(ggpubr)         # data visualization, built on ggplot2

# III. analysis packages
library(vegan)          # decostand(), metaMDS(), scores(), rda(), ordispider(), adonis()

# IV. functions for automating and streamlining workflow
#library(devtools)      
#devtools::install_github("https://github.com/cb-42/cbmbtools")
library(cbmbtools)

#V. Rmarkdown formatting
library(pander) # format session info

This report was created with:

pander(sessionInfo()) 

R version 4.0.2 (2020-06-22)

Platform: x86_64-apple-darwin17.0 (64-bit)

locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8

attached base packages: stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: pander(v.0.6.3), cbmbtools(v.0.0.0.9034), vegan(v.2.5-6), lattice(v.0.20-41), permute(v.0.9-5), ggpubr(v.0.4.0), knitr(v.1.29), scales(v.1.1.1), gtools(v.3.8.2), tidyselect(v.1.1.0), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.1.0.1), purrr(v.0.3.4), readr(v.1.3.1), tidyr(v.1.1.1), tibble(v.3.0.3), ggplot2(v.3.3.2) and tidyverse(v.1.3.0)

loaded via a namespace (and not attached): httr(v.1.4.2), jsonlite(v.1.7.0), splines(v.4.0.2), carData(v.3.0-4), modelr(v.0.1.8), assertthat(v.0.2.1), blob(v.1.2.1), cellranger(v.1.1.0), yaml(v.2.2.1), pillar(v.1.4.6), backports(v.1.1.8), glue(v.1.4.1), digest(v.0.6.25), ggsignif(v.0.6.0), rvest(v.0.3.6), colorspace(v.1.4-1), Matrix(v.1.2-18), htmltools(v.0.5.0), pkgconfig(v.2.0.3), broom(v.0.7.0), haven(v.2.3.1), openxlsx(v.4.1.5), rio(v.0.5.16), mgcv(v.1.8-31), generics(v.0.0.2), car(v.3.0-9), ellipsis(v.0.3.1), withr(v.2.2.0), cli(v.2.0.2), magrittr(v.1.5), crayon(v.1.3.4), readxl(v.1.3.1), evaluate(v.0.14), fs(v.1.5.0), fansi(v.0.4.1), nlme(v.3.1-148), MASS(v.7.3-51.6), rstatix(v.0.6.0), xml2(v.1.3.2), foreign(v.0.8-80), tools(v.4.0.2), data.table(v.1.13.0), hms(v.0.5.3), lifecycle(v.0.2.0), munsell(v.0.5.0), reprex(v.0.3.0), cluster(v.2.1.0), zip(v.2.0.4), compiler(v.4.0.2), rlang(v.0.4.7), grid(v.4.0.2), rstudioapi(v.0.11), rmarkdown(v.2.3), codetools(v.0.2-16), gtable(v.0.3.0), abind(v.1.4-5), DBI(v.1.1.0), curl(v.4.3), R6(v.2.4.1), lubridate(v.1.7.9), stringi(v.1.4.6), parallel(v.4.0.2), Rcpp(v.1.0.5), vctrs(v.0.3.2), dbplyr(v.1.4.4) and xfun(v.0.16)

# function for computing summary statistics - error bar helper
sem <- function(x){
  sqrt(var(x, na.rm = TRUE)/length(na.omit(x)))
}
set.seed(3132)

I. 16S rRNA gene amplicon quantification

#data prep
ddpcr <- ddpcr %>% group_by(Sample_Type)

ddpcr.summary <- summarize(ddpcr, 
                            Mean=mean(Gene_16S_copies_per_mL), 
                            SEM=sqrt(var(Gene_16S_copies_per_mL)/length(Gene_16S_copies_per_mL)))

ggplot(ddpcr, aes(x=Sample_Type)) + 
  geom_col(data=ddpcr.summary, aes(x=Sample_Type, y=Mean, fill=Sample_Type), color= "black", show.legend = FALSE) +
  geom_point(aes(x=Sample_Type, y=Gene_16S_copies_per_mL)) +
  geom_errorbar(data=ddpcr.summary, aes(ymin = Mean-SEM, ymax = Mean+SEM, width = 0.3)) + 
  theme_classic() + 
  scale_y_log10(
   limits = c(10^0, 10^6),
   breaks = scales::trans_breaks("log10", function(x) 10^x),
   labels = scales::trans_format("log10", scales::math_format(10^.x)), expand = c(0,0)) + 
theme(axis.text.y = element_text(size=12), 
      axis.text.x = element_text(size=10), 
      axis.title.y = element_text(size=12, face="bold", angle = 0, vjust=0.5), 
      axis.title.x = element_text(size=12, face="bold"), 
      plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")) + 
ylab("16S rRNA Gene\nCopies/mL") +
xlab("\nSample Type") 

Hypothesis testing - 16S rRNA gene amplicon quantification

#overall difference between groups - kruskal wallis test for multiple group comparisons 
kruskal.test(Gene_16S_copies_per_mL ~ Sample_Type, data=ddpcr)

#pairwise difference between groups - Wilcoxon rank sum test, adjusted for multiple comparisons
pairwise.wilcox.test(ddpcr$Gene_16S_copies_per_mL, ddpcr$Sample_Type, p.adjust.method = "BH")

    Kruskal-Wallis rank sum test

data:  Gene_16S_copies_per_mL by Sample_Type
Kruskal-Wallis chi-squared = 29.276, df = 6, p-value = 5.394e-05


    Pairwise comparisons using Wilcoxon rank sum exact test 

data:  ddpcr$Gene_16S_copies_per_mL and ddpcr$Sample_Type 

              NTC     Iso_Ctrl PBS     Syringe_Rinse Homog_Ctrl BAL    
Iso_Ctrl      0.64000 -        -       -             -          -      
PBS           0.58947 0.40000  -       -             -          -      
Syringe_Rinse 0.42353 0.03333  0.40000 -             -          -      
Homog_Ctrl    0.58947 0.16667  0.42353 0.40000       -          -      
BAL           0.22238 0.00839  0.31818 0.27082       0.90909    -      
Lung          0.00839 0.00262  0.07955 0.00839       0.07955    0.00023

P value adjustment method: BH 

II. Quality Checks - Confirmation of Sufficient Reads

2.1 MiSeq Reads by Sample Type

#prepare data for plotting 
reads.by.sample.type.df <- otu_df

reads.by.sample.type.df$Sample_Type2 <- factor(reads.by.sample.type.df$Sample_Type2, 
                                               levels = c("Mock", "Water", "Empty", "AE", "Iso_Ctrl", "PBS", "Syringe_Rinse", "Homog_Ctrl", "BAL", "Lung", "Tongue", "Cecum"))
                                               
#plot number of MiSeq reads by sample type
ggplot(reads.by.sample.type.df, aes(x=Sample_Type2, y=Reads.PF.Sample)) + 
  geom_boxplot(aes(fill = Sample_Type2), show.legend=FALSE) + 
  geom_point() + 
  theme_classic() +
  scale_y_log10(
   limits = c(10^0, 10^6),
   breaks = scales::trans_breaks("log10", function(x) 10^x),
   labels = scales::trans_format("log10", scales::math_format(10^.x)), expand = c(0,0)) +
theme(axis.text.y = element_text(size=12), 
      axis.text.x = element_text(size=12), 
      axis.title.y = element_text(size=15, face="bold", angle = 0, vjust=0.5), 
      axis.title.x = element_text(size=15, face="bold"), 
      plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")) + 
ylab("MiSeq\nReads") +
xlab("\nSample Type") 

2.2 OTU Counts by Sample Type

# prepare data for plotting 
otu.total.counts.byExpt <- bind_cols(otu_df[,c("Sample", "Sample_Type2", "Organ")], OTU_Count = rowSums(otu_good)) %>%
  group_by(Sample_Type2) 

otu.total.counts.byExpt$Sample_Type2 <- factor(otu.total.counts.byExpt$Sample_Type2, 
                                               levels = c("Mock", "Water", "Empty", "AE", "Iso_Ctrl", "PBS", "Syringe_Rinse", "Homog_Ctrl", "BAL", "Lung", "Tongue", "Cecum"))

# plot OTU counts by sample type
ggplot(otu.total.counts.byExpt, aes(x=Sample_Type2, y=OTU_Count)) + 
  geom_boxplot(aes(fill = Sample_Type2), show.legend=FALSE) + 
  geom_point() + 
  theme_classic() +
  scale_y_log10(
   limits = c(10^0, 10^6),
   breaks = scales::trans_breaks("log10", function(x) 10^x),
   labels = scales::trans_format("log10", scales::math_format(10^.x)), expand = c(0,0)) +
theme(axis.text.y = element_text(size=12), 
      axis.text.x = element_text(size=12), 
      axis.title.y = element_text(size=15, face="bold", angle = 0, vjust=0.5), 
      axis.title.x = element_text(size=15, face="bold"), 
      plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")) + 
ylab("OTU\nCount") +
xlab("\nSample Type") 

III. Principal Component Analysis

3.1 Lung Samples vs. Sampling Controls

#extract samples of interest with specified strings - returns vector of strings and NAs
extract.lungsamp.sampctrls <- str_extract(rownames(otu_good), pattern="_BAL_|_Lung_|PBS|CLEAN|USED|Homog")

#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.lungsamp.sampctrls.good <- !is.na(extract.lungsamp.sampctrls)

#subset otu counts 
otu.good.lungsamp.sampctrls <- otu_good[extract.lungsamp.sampctrls.good, ] 

#subset otu.df for PCA
otu.df.lungsamp.sampctrls <- filter(otu_df, Sample_Type == "BAL" | Sample_Type == "Lung" | Sample_Type == "PBS" | Sample_Type == "Clean_Rinse" | Sample_Type == "Used_Rinse" | Sample_Type == "Homog_Ctrl")

#make a hellinger normalized df and pca object from the normalized df 
make_hel_pca(otu.good.lungsamp.sampctrls)

# summarize pca object to allow for pulling the coordinates in the PC space
lungsamp_sampctrls_pca_summary <- summary(otu.good.lungsamp.sampctrls_pca) 

# select PC1 and PC2 to plot
#"sites" refers to each individual sample, i.e. ecologic sites 
pc_space <- lungsamp_sampctrls_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3) 
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
                    
pc_loadings <- lungsamp_sampctrls_pca_summary$species # species as loading vectors

# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, Sample_Type2 = otu.df.lungsamp.sampctrls[,"Sample_Type2"], PC1 = pc_space$PC1, PC2 = pc_space$PC2) 

# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>% 
  group_by(Sample_Type2) %>%
  summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%  
  ungroup()

# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="Sample_Type2") 
# Now you can use your dataframe and mainpulate it how you want

pc_plot_data$Sample_Type2 <- factor(pc_plot_data$Sample_Type2, 
                                     levels = c("Lung", "BAL", "PBS", "Syringe_Rinse",  "Homog_Ctrl"),
                                     labels = c("Lung", "BAL", "Saline", "Syringe Rinse",  "Homogenization\nControl"))

pca_pal <- c( "#EB3D0E", #scarlet - lung
               "#0300A6", # blue - BAL
              "#666B6E", #- saline
              "#9A99E9", #- used rinse
             "#F2C121" # saffron - water
        )

# plot PCA 
ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") + 
  geom_point(show.legend = FALSE, aes(color = Sample_Type2), size = 3) +
  # segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=Sample_Type2, color=Sample_Type2), size=3, show.legend = FALSE) + # centroid information
  theme_classic() +
  theme(legend.title = element_blank(), 
      axis.text.y = element_text(size=15), 
      axis.text.x = element_text(size=15), 
      axis.title.y = element_text(size=12, face="bold", vjust = 0), 
      axis.title.x = element_text(size=12, face="bold", vjust = 0), 
      plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
            ) + 
  scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5)) +
  scale_y_continuous(limits = c(-1.1, 1.1), breaks = seq(-1,1,0.5)) +
  labs(
    x = "\nPrincipal Component 1 (13.7% explained)",
    y = "Principal Component 2  (7.0% explained)\n"
  ) + scale_color_manual(values=pca_pal)

#prepare hellinger-transformed df to be filtered for pairwise comparisons
adonis.select.vec <- dplyr::select(otu.df.lungsamp.sampctrls, Sample:Sample_Type)
adonis.hel.df <- rownames_to_column(otu.good.lungsamp.sampctrls_hel, var = "Sample")
adonis.hel.df <- left_join(adonis.hel.df, adonis.select.vec, by = "Sample") %>% 
                  dplyr::select(Sample, Sample_Type, everything())

#multiple comparisons - whole lung v. BAL v. pooled sampling controls 
adonis.hel.df.wbn <- column_to_rownames(adonis.hel.df, var = "Sample") %>% 
                      dplyr::select(-Sample_Type)
adonis(adonis.hel.df.wbn~otu.df.lungsamp.sampctrls$RA_Groups, method="euclidean", permutations=10000)

#pairwise comparisons - whole lung v. bal
adonis.hel.bl <- dplyr::filter(adonis.hel.df, Sample_Type == "Lung" | Sample_Type == "BAL") %>%
                  column_to_rownames(var = "Sample") %>% 
                  dplyr::select(-Sample_Type)
adonis.otudf.bl <- dplyr::filter(otu.df.lungsamp.sampctrls, 
                    Sample_Type == "Lung" | Sample_Type == "BAL")
adonis(adonis.hel.bl~adonis.otudf.bl$Sample_Type, method="euclidean", permutations=10000)

#pairwise comparison - whole lung v. sampling negative controls 
adonis.hel.wn <- dplyr::filter(adonis.hel.df, Sample_Type != "BAL") %>% 
                  column_to_rownames(var = "Sample") %>% 
                  dplyr::select(-Sample_Type)
adonis.otudf.wn <- dplyr::filter(otu.df.lungsamp.sampctrls, Sample_Type != "BAL")
adonis(adonis.hel.wn~adonis.otudf.wn$RA_Groups, method="euclidean", permutations=10000)

#pairwise comparison - BAL v. sampling negative controls 
adonis.hel.bn <- dplyr::filter(adonis.hel.df, Sample_Type != "Lung") %>% 
                  column_to_rownames(var = "Sample") %>%
                  dplyr::select(-Sample_Type)
adonis.otudf.bn <- dplyr::filter(otu.df.lungsamp.sampctrls, Sample_Type != "Lung")
adonis(adonis.hel.bn~adonis.otudf.bn$Organ, method="euclidean", permutations=10000)

Call:
adonis(formula = adonis.hel.df.wbn ~ otu.df.lungsamp.sampctrls$RA_Groups,      permutations = 10000, method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                                    Df SumsOfSqs MeanSqs F.Model      R2
otu.df.lungsamp.sampctrls$RA_Groups  2    3.9774 1.98870  2.4676 0.16486
Residuals                           25   20.1478 0.80591         0.83514
Total                               27   24.1252                 1.00000
                                       Pr(>F)    
otu.df.lungsamp.sampctrls$RA_Groups 9.999e-05 ***
Residuals                                        
Total                                            
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
adonis(formula = adonis.hel.bl ~ adonis.otudf.bl$Sample_Type,      permutations = 10000, method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                            Df SumsOfSqs MeanSqs F.Model      R2    Pr(>F)    
adonis.otudf.bl$Sample_Type  1     2.469 2.46898  3.1997 0.15093 9.999e-05 ***
Residuals                   18    13.889 0.77162         0.84907              
Total                       19    16.358                 1.00000              
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
adonis(formula = adonis.hel.wn ~ adonis.otudf.wn$RA_Groups, permutations = 10000,      method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                          Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
adonis.otudf.wn$RA_Groups  1    2.5283 2.52828   3.423 0.17623  2e-04 ***
Residuals                 16   11.8179 0.73862         0.82377           
Total                     17   14.3462                 1.00000           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
adonis(formula = adonis.hel.bn ~ adonis.otudf.bn$Organ, permutations = 10000,      method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                      Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
adonis.otudf.bn$Organ  1    0.9155 0.91549  1.0041 0.05905 0.4382
Residuals             16   14.5885 0.91178         0.94095       
Total                 17   15.5040                 1.00000       

3.2 Lung Samples v. Isolation Controls

#extract samples of interest with specified strings - returns vector of strings and NAs
extract.lungsamp.isoctrls <- str_extract(rownames(otu_good), pattern="_BAL_|_Lung_|_IsoCtrl_|AE")

#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.lungsamp.isoctrls.good <- !is.na(extract.lungsamp.isoctrls)

#subset otu counts 
otu.good.lungsamp.isoctrls <- otu_good[extract.lungsamp.isoctrls.good, ] 

#subset otu.df for PCA
otu.df.lungsamp.isoctrls <- filter(otu_df, Sample_Type == "BAL" | Sample_Type == "Lung" | Sample_Type == "Iso_Ctrl" | Sample_Type == "AE")

#make a hellinger normalized df and pca object from the normalized df 
make_hel_pca(otu.good.lungsamp.isoctrls)

# summarize pca object to allow for pulling the coordinates in the PC space
lungsamp_isoctrls_pca_summary <- summary(otu.good.lungsamp.isoctrls_pca) 

# select PC1 and PC2 to plot
#"sites" refers to each individual sample, i.e. ecologic sites 
pc_space <- lungsamp_isoctrls_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3)
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
                    
pc_loadings <- lungsamp_isoctrls_pca_summary$species  # species as loading vectors 

# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, Sample_Type2 = otu.df.lungsamp.isoctrls[,"Sample_Type2"], PC1 = pc_space$PC1, PC2 = pc_space$PC2) 

# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>% 
  group_by(Sample_Type2) %>%
  summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%  
  ungroup()

# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="Sample_Type2") 
# Now you can use your dataframe and mainpulate it how you want

pc_plot_data$Sample_Type2 <- factor(pc_plot_data$Sample_Type2, 
                                     levels = c("Lung", "BAL", "AE", "Iso_Ctrl"), 
                                     labels = c("Lung", "BAL", "Elution\nBuffer", "Isolation\nControl"))


pca_pal <- c( "#EB3D0E", #scarlet - lung
               "#0300A6", # blue - BAL
              "#DBE6EC", #- AE
              "#666B6E" #- iso ctrl
        )

# plot PCA
ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") + 
  geom_point(show.legend = FALSE, aes(color = Sample_Type2), size = 3) +
  # segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=Sample_Type2, color=Sample_Type2), size=3, show.legend = FALSE) + # centroid information
  theme_classic() +
  theme(legend.title = element_blank(), 
      axis.text.y = element_text(size=15), 
      axis.text.x = element_text(size=15), 
      axis.title.y = element_text(size=12, face="bold", vjust = 0), 
      axis.title.x = element_text(size=12, face="bold", vjust = 0), 
      plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
            ) + 
  scale_color_manual(values = pca_pal) + 
  scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5)) +
  scale_y_continuous(limits = c(-1.1, 1.1), breaks = seq(-1,1,0.5)) +
  labs(
    x = "\nPrincipal Component 1 (12.1% explained)",
    y = "Principal Component 2  (9.0% explained)\n"
  ) 

#prepare hellinger-transformed df to be filtered for pairwise comparisons
adonis.select.vec <- dplyr::select(otu.df.lungsamp.isoctrls, Sample:Sample_Type)
adonis.hel.df <- rownames_to_column(otu.good.lungsamp.isoctrls_hel, var = "Sample")
adonis.hel.df <- left_join(adonis.hel.df, adonis.select.vec, by = "Sample") %>% 
                  dplyr::select(Sample, Sample_Type, everything())

#multiple comparisons - whole lung v. BAL v. pooled isolation controls
adonis(otu.good.lungsamp.isoctrls_hel~otu.df.lungsamp.isoctrls$Sample_Type, 
       method="euclidean", permutations=10000)

#pairwise comparison - whole lung v. iso ctrls 
adonis.hel.il <- dplyr::filter(adonis.hel.df, Sample_Type != "BAL") %>% 
                  column_to_rownames(var = "Sample") %>% dplyr::select(-Sample_Type)
adonis.otudf.il <- dplyr::filter(otu.df.lungsamp.isoctrls, Sample_Type != "BAL")
adonis(adonis.hel.il~adonis.otudf.il$RA_Groups, method="euclidean", permutations=10000)

#pairwise comparison - BAL v. iso ctrls
adonis.hel.ib <- dplyr::filter(adonis.hel.df, Sample_Type != "Lung") %>% 
                  column_to_rownames(var = "Sample") %>% dplyr::select(-Sample_Type)
adonis.otudf.ib <- dplyr::filter(otu.df.lungsamp.isoctrls, Sample_Type != "Lung")
adonis(adonis.hel.ib~adonis.otudf.ib$RA_Groups, method="euclidean", permutations=10000)

Call:
adonis(formula = otu.good.lungsamp.isoctrls_hel ~ otu.df.lungsamp.isoctrls$Sample_Type,      permutations = 10000, method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                                     Df SumsOfSqs MeanSqs F.Model      R2
otu.df.lungsamp.isoctrls$Sample_Type  3    5.1677  1.7226  2.1035 0.18392
Residuals                            28   22.9291  0.8189         0.81608
Total                                31   28.0968                 1.00000
                                        Pr(>F)    
otu.df.lungsamp.isoctrls$Sample_Type 9.999e-05 ***
Residuals                                         
Total                                             
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
adonis(formula = adonis.hel.il ~ adonis.otudf.il$RA_Groups, permutations = 10000,      method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                          Df SumsOfSqs MeanSqs F.Model      R2    Pr(>F)    
adonis.otudf.il$RA_Groups  1    2.7563 2.75633  3.5591 0.15107 9.999e-05 ***
Residuals                 20   15.4890 0.77445         0.84893              
Total                     21   18.2453                 1.00000              
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
adonis(formula = adonis.hel.ib ~ adonis.otudf.ib$RA_Groups, permutations = 10000,      method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                          Df SumsOfSqs MeanSqs F.Model      R2  Pr(>F)  
adonis.otudf.ib$RA_Groups  1    1.2216 1.22157   1.338 0.06271 0.05969 .
Residuals                 20   18.2596 0.91298         0.93729          
Total                     21   19.4812                 1.00000          
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.3 Lung Samples vs. Sequencing Controls

#extract samples of interest with specified strings - returns vector of strings and NAs
extract.lungsamp.seqctrls <- str_extract(rownames(otu_good), pattern="_BAL_|_Lung_|Water|Blank")

#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.lungsamp.seqctrls.good <- !is.na(extract.lungsamp.seqctrls)

#subset otu counts 
otu.good.lungsamp.seqctrls <- otu_good[extract.lungsamp.seqctrls.good, ] 

#subset otu.df for PCA
otu.df.lungsamp.seqctrls <- filter(otu_df, Sample_Type == "BAL" | Sample_Type == "Lung" | Sample_Type == "Water" | Sample_Type == "Empty")

#make a hellinger normalized df and pca object from the normalized df 
make_hel_pca(otu.good.lungsamp.seqctrls)

# summarize pca object to allow for pulling the coordinates in the PC space
lungsamp_seqctrls_pca_summary <- summary(otu.good.lungsamp.seqctrls_pca) 

# select PC1 and PC2 to plot
#"sites" refers to each individual sample, i.e. ecologic sites 
pc_space <- lungsamp_seqctrls_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3)
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
                    
pc_loadings <- lungsamp_seqctrls_pca_summary$species # species as loading vectors

# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, Sample_Type2 = otu.df.lungsamp.seqctrls[,"Sample_Type2"], PC1 = pc_space$PC1, PC2 = pc_space$PC2) 

# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>% 
  group_by(Sample_Type2) %>%
  summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%  
  ungroup()

# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="Sample_Type2") 
# Now you can use your dataframe and mainpulate it how you want

pc_plot_data$Sample_Type2 <- factor(pc_plot_data$Sample_Type2, 
                                     levels = c("Lung", "BAL", "Empty", "Water"), 
                                     labels = c("Lung", "BAL", "Empty\nWell", "Water"))


pca_pal <- c( "#EB3D0E", #scarlet - lung
               "#0300A6", # blue - BAL
              "#F2C121", #- Empty
              "#666B6E" #- Water
        )

ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") + 
  geom_point(show.legend = FALSE, aes(color = Sample_Type2), size = 3) +
  # segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=Sample_Type2, color=Sample_Type2), size=3, show.legend = FALSE) + # centroid information
  theme_classic() +
  theme(legend.title = element_blank(), 
      axis.text.y = element_text(size=15), 
      axis.text.x = element_text(size=15), 
      axis.title.y = element_text(size=12, face="bold", vjust = 0), 
      axis.title.x = element_text(size=12, face="bold", vjust = 0), 
      plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
            ) + 
  scale_color_manual(values = pca_pal) + 
  scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5)) +
  scale_y_continuous(limits = c(-1.1, 1.1), breaks = seq(-1,1,0.5)) +
  labs(
    x = "\nPrincipal Component 1 (13.5% explained)",
    y = "Principal Component 2  (6.6% explained)\n"
  ) 

#prepare hellinger-transformed df to be filtered for pairwise comparisons
adonis.select.vec <- dplyr::select(otu.df.lungsamp.seqctrls, Sample:Sample_Type)
adonis.hel.df <- rownames_to_column(otu.good.lungsamp.seqctrls_hel, var = "Sample")
adonis.hel.df <- left_join(adonis.hel.df, adonis.select.vec, by = "Sample") %>% 
                  dplyr::select(Sample, Sample_Type, everything())

#multiple comparisons - whole lung v. BAL v. pooled sequencing controls
adonis(otu.good.lungsamp.seqctrls_hel~otu.df.lungsamp.seqctrls$Sample_Type, 
      method="euclidean", 
      permutations=10000)

#pairwise comparison - whole lung v. seq ctrls 
adonis.hel.sl <- dplyr::filter(adonis.hel.df, Sample_Type != "BAL") %>% 
                  column_to_rownames(var = "Sample") %>% 
                  dplyr::select(-Sample_Type)
adonis.otudf.sl <- dplyr::filter(otu.df.lungsamp.seqctrls, Sample_Type != "BAL")
adonis(adonis.hel.sl~adonis.otudf.sl$RA_Groups, method="euclidean", permutations=10000)

#pairwise comparison - BAL v. seq ctrls
adonis.hel.sb <- dplyr::filter(adonis.hel.df, Sample_Type != "Lung") %>% 
                  column_to_rownames(var = "Sample") %>% 
                  dplyr::select(-Sample_Type)
adonis.otudf.sb <- dplyr::filter(otu.df.lungsamp.seqctrls, Sample_Type != "Lung")
adonis(adonis.hel.sb~adonis.otudf.sb$RA_Groups, method="euclidean", permutations=10000)

Call:
adonis(formula = otu.good.lungsamp.seqctrls_hel ~ otu.df.lungsamp.seqctrls$Sample_Type,      permutations = 10000, method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                                     Df SumsOfSqs MeanSqs F.Model     R2
otu.df.lungsamp.seqctrls$Sample_Type  3     8.155 2.71844  3.4176 0.1647
Residuals                            52    41.362 0.79543         0.8353
Total                                55    49.517                 1.0000
                                        Pr(>F)    
otu.df.lungsamp.seqctrls$Sample_Type 9.999e-05 ***
Residuals                                         
Total                                             
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
adonis(formula = adonis.hel.sl ~ adonis.otudf.sl$RA_Groups, permutations = 10000,      method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                          Df SumsOfSqs MeanSqs F.Model      R2    Pr(>F)    
adonis.otudf.sl$RA_Groups  1     4.402  4.4017  5.5369 0.11177 9.999e-05 ***
Residuals                 44    34.979  0.7950         0.88823              
Total                     45    39.381                 1.00000              
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
adonis(formula = adonis.hel.sb ~ adonis.otudf.sb$RA_Groups, permutations = 10000,      method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                          Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
adonis.otudf.sb$RA_Groups  1     2.083 2.08261  2.4274 0.05228  4e-04 ***
Residuals                 44    37.750 0.85795         0.94772           
Total                     45    39.833                 1.00000           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.4 Lung Samples vs. Tongue

#extract samples of interest with specified strings - returns vector of strings and NAs
extract.tong.lungsamp <- str_extract(rownames(otu_good), pattern="Tong|_BAL_|_Lung_")

#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.tong.lungsamp.good <- !is.na(extract.tong.lungsamp)

#subset otu counts 
otu.good.tong.lungsamp <- otu_good[extract.tong.lungsamp.good, ] 

#subset otu.df for PCA
otu.df.tong.lungsamp <- filter(otu_df, Sample_Type == "Tongue" | Sample_Type == "BAL" | Sample_Type == "Lung")

otu.df.tong.lungsamp <- mutate(otu.df.tong.lungsamp, PCA_Groups = ifelse(
                                          test = Organ == "Lung",             
                                          yes =  as.character(RA_Groups), 
                                          no = as.character(Organ))) %>% 
 dplyr::select(Sample:Gene_16S_copies_per_mL, PCA_Groups, everything())
 

otu.df.tong.lungsamp$PCA_Groups <- as.factor(otu.df.tong.lungsamp$PCA_Groups)

#make a hellinger normalized df and pca object from the normalized df 
make_hel_pca(otu.good.tong.lungsamp)

tong_lungsamp_pca_summary <- summary(otu.good.tong.lungsamp_pca) # summarize pca object to allow for pulling the coordinates in the PC space

# select PC1 and PC2 for plotting
#"sites" refers to each individual sample, i.e. ecologic sites 
pc_space <- tong_lungsamp_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3)
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
                    
pc_loadings <- tong_lungsamp_pca_summary$species # species as loading vectors 

# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, PCA_Groups = otu.df.tong.lungsamp[,"PCA_Groups"], PC1 = pc_space$PC1, PC2 = pc_space$PC2) 

# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>% 
  group_by(PCA_Groups) %>%
  summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%  
  ungroup()

# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="PCA_Groups") 
# Now you can use your dataframe and mainpulate it how you want

pc_plot_data$PCA_Groups <- factor(pc_plot_data$PCA_Groups, 
                              levels = c("Lung", "BAL", "Tongue"))
                                    

pca_pal <- c( "#EB3D0E", #scarlet - lung
               "#0300A6", # blue - BAL
              "#FB9986" #vivid tangerine - tongue
        )


ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") + 
  geom_point(show.legend = FALSE, aes(color = PCA_Groups), size = 2) +
  # segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=PCA_Groups, color=PCA_Groups), size=3, show.legend = FALSE) + # centroid information
  theme_classic() +
  theme(legend.title = element_blank(), 
      axis.text.y = element_text(size=15), 
      axis.text.x = element_text(size=15), 
      axis.title.y = element_text(size=12, face="bold", vjust = 0), 
      axis.title.x = element_text(size=12, face="bold", vjust = 0), 
      plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
            ) + 
  scale_color_manual(values = pca_pal) + 
  scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5)) +
  scale_y_continuous(limits = c(-1.1, 1.1), breaks = seq(-1,1,0.5)) +
  labs(
    x = "\nPrincipal Component 1 (10.5% explained)",
    y = "Principal Component 2  (4.6% explained)\n"
  ) 

#multiple comparisons - whole lung v. BAL v. tongue
adonis(otu.good.tong.lungsamp_hel~otu.df.tong.lungsamp$Sample_Type, 
       method="euclidean", 
       permutations=10000)

#prepare hellinger-transformed df to be filtered for pairwise comparisons
adonis.select.vec <- dplyr::select(otu.df.tong.lungsamp, Sample:Sample_Type)
adonis.hel.df <- rownames_to_column(otu.good.tong.lungsamp_hel, var = "Sample")
adonis.hel.df <- left_join(adonis.hel.df, adonis.select.vec, by = "Sample") %>% 
                  dplyr::select(Sample, Sample_Type, everything())

#pairwise comparison - whole lung v. tongue
adonis.hel.tl <- dplyr::filter(adonis.hel.df, Sample_Type == "Lung" | Sample_Type == "Tongue") %>%
                  column_to_rownames(var = "Sample") %>% 
                  dplyr::select(-Sample_Type)
adonis.otudf.tl <- dplyr::filter(otu.df.tong.lungsamp, 
                    Sample_Type == "Lung" | Sample_Type == "Tongue")
adonis(adonis.hel.tl~adonis.otudf.tl$Sample_Type, method="euclidean", permutations=10000)

#pairwise comparison - BAL v. tongue
adonis.hel.tb <- dplyr::filter(adonis.hel.df, Sample_Type == "BAL" | Sample_Type == "Tongue") %>%
                  column_to_rownames(var = "Sample") %>% 
                  dplyr::select(-Sample_Type)
adonis.otudf.tb <- dplyr::filter(otu.df.tong.lungsamp, Sample_Type == "BAL" | Sample_Type == "Tongue")
adonis(adonis.hel.tb~adonis.otudf.tb$Sample_Type, method="euclidean", permutations=10000)

Call:
adonis(formula = otu.good.tong.lungsamp_hel ~ otu.df.tong.lungsamp$Sample_Type,      permutations = 10000, method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                                 Df SumsOfSqs MeanSqs F.Model      R2    Pr(>F)
otu.df.tong.lungsamp$Sample_Type  2    3.9905  1.9953  2.6829 0.12665 9.999e-05
Residuals                        37   27.5168  0.7437         0.87335          
Total                            39   31.5073                 1.00000          
                                    
otu.df.tong.lungsamp$Sample_Type ***
Residuals                           
Total                               
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
adonis(formula = adonis.hel.tl ~ adonis.otudf.tl$Sample_Type,      permutations = 10000, method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                            Df SumsOfSqs MeanSqs F.Model      R2   Pr(>F)   
adonis.otudf.tl$Sample_Type  1    1.0559 1.05587  1.5409 0.05216 0.009199 **
Residuals                   28   19.1868 0.68524         0.94784            
Total                       29   20.2427                 1.00000            
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Call:
adonis(formula = adonis.hel.tb ~ adonis.otudf.tb$Sample_Type,      permutations = 10000, method = "euclidean") 

Permutation: free
Number of permutations: 10000

Terms added sequentially (first to last)

                            Df SumsOfSqs MeanSqs F.Model      R2    Pr(>F)    
adonis.otudf.tb$Sample_Type  1    2.6188 2.61884  3.3395 0.10656 9.999e-05 ***
Residuals                   28   21.9574 0.78419         0.89344              
Total                       29   24.5763                 1.00000              
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

3.5 Lung vs. Tongue vs. Negative Controls

#extract samples of interest with specified strings - returns vector of strings and NAs
extract.tong.lungsamp.allnegctrls <- str_extract(rownames(otu_good), pattern="Tong|_BAL_|_Lung_|PBS|CLEAN|USED|Homog|AE|Iso|Water|Blank")

#select subsets based on extracted samp names - returns a logical vector (NA --> FALSE)
extract.tong.lungsamp.allnegctrls.good <- !is.na(extract.tong.lungsamp.allnegctrls)

#subset otu counts 
otu.good.tong.lungsamp.allnegctrls <- otu_good[extract.tong.lungsamp.allnegctrls.good, ] 

#subset otu.df for PCA
otu.df.tong.lungsamp.allnegctrls <- filter(otu_df, Sample_Type == "Tongue" | Sample_Type == "BAL" | Sample_Type == "Lung" | Sample_Type == "PBS" | Sample_Type == "Clean_Rinse" | Sample_Type == "Used_Rinse" | Sample_Type == "Homog_Ctrl" | Sample_Type == "AE" | Sample_Type == "Iso_Ctrl" | Sample_Type == "Water" | Sample_Type == "Empty")

otu.df.tong.lungsamp.allnegctrls <- mutate(otu.df.tong.lungsamp.allnegctrls, PCA_Groups = ifelse(
                                          test = Organ == "Lung",             
                                          yes =  as.character(RA_Groups), 
                                          no = as.character(Organ))) %>% 
 dplyr::select(Sample:Gene_16S_copies_per_mL, PCA_Groups, everything())

otu.df.tong.lungsamp.allnegctrls$PCA_Groups <- as.factor(otu.df.tong.lungsamp.allnegctrls$PCA_Groups)

#make a hellinger normalized df and pca object from the normalized df 
make_hel_pca(otu.good.tong.lungsamp.allnegctrls)

# summarize pca object to allow for pulling the coordinates in the PC space
tong_lungsamp_relevnegctrls_pca_summary <- summary(otu.good.tong.lungsamp.allnegctrls_pca) 

# select PC1 and PC2 for plotting
#"sites" refers to each individual sample, i.e. ecologic sites 
pc_space <- tong_lungsamp_relevnegctrls_pca_summary$sites %>% as.data.frame() %>% rownames_to_column() %>% dplyr::select(1:3)
colnames(pc_space) <- c("Sample_Name", "PC1", "PC2")
                    
pc_loadings <- tong_lungsamp_relevnegctrls_pca_summary$species # species as loading vectors

# attach your data from the pca object to your labels
centroid_df <- data.frame(Sample_Name = pc_space$Sample_Name, PCA_Groups = otu.df.tong.lungsamp.allnegctrls[,"PCA_Groups"], PC1 = pc_space$PC1, PC2 = pc_space$PC2) 

# calculate the mean x coordinate and y coordinate for each sample in PC1/PC2 space
centroids <- centroid_df %>% 
  group_by(PCA_Groups) %>%
  summarize(x_coord=mean(PC1), y_coord=mean(PC2)) %>%  
  ungroup()

# to EACH SUBJECT in the pc object, add a column which contains the x and y
#coordinates for the centroid for the group that the subject belongs to
pc_plot_data <- full_join(centroid_df, centroids, by ="PCA_Groups") 
# Now you can use your dataframe and mainpulate it how you want

pc_plot_data$PCA_Groups <- factor(pc_plot_data$PCA_Groups, 
                              levels = c("Lung", "BAL", "Tongue", "Sampling Control", "Isolation Control", "Sequencing Control"))
                                    

pca_pal <- c( "#EB3D0E", #scarlet - lung
               "#0300A6", # blue - BAL
              "#FB9986", #vivid tangerine - tongue
              "#0BDBA7", #sea foam green - sampling ctrls
              "#F2C121", #- isolation ctrls 
              "#666B6E" #- sequencing ctrls
        )

# plot PCA
ggplot(pc_plot_data, aes(x = PC1, y = PC2)) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) + 
  geom_segment(aes(x=x_coord, xend = PC1, y=y_coord, yend = PC2), color = "black") + 
  geom_point(show.legend = FALSE, aes(color = PCA_Groups), size = 2) +
  # segment starting at centroid, ending at point
geom_label(aes(x=x_coord, y=y_coord, label=PCA_Groups, color=PCA_Groups), size=3, show.legend = FALSE) + # centroid information
  theme_classic() +
  theme(legend.title = element_blank(), 
      axis.text.y = element_text(size=15), 
      axis.text.x = element_text(size=15), 
      axis.title.y = element_text(size=12, face="bold", vjust = 0), 
      axis.title.x = element_text(size=12, face="bold", vjust = 0), 
      plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm")
            ) + 
  scale_color_manual(values = pca_pal) + 
  scale_x_continuous(limits = c(-0.75, 0.75), breaks =  c(-0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75)) +
  scale_y_continuous(limits = c(-0.75, 0.75), breaks = c(-0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75)) +
  labs(
    x = "\nPrincipal Component 1 (11.6% explained)",
    y = "Principal Component 2  (6.3% explained)\n"
  ) 

IV. Diversity Indices

4.1 Alpha Diversity - Rarified Richness

#Rarefaction curves can also be used to explore α-diversity. First, the data needs to be prepared.  
otu_raw_rare <- otu_raw
rownames(otu_raw_rare) <-  str_remove(string = rownames(otu_raw_rare), pattern = "_S\\d+_L001_R1_001") %>% str_remove(pattern = "_S\\d+_L001_R1") 
otu_raw_rare <- dplyr::select(otu_raw_rare, -label, -numOtus)

tvl.rownames <- vars_select(rownames(otu_raw_rare), !contains("Mock")) %>% as.data.frame() 
colnames(tvl.rownames) <- "Sample"

div.rare.bl <- rownames_to_column(otu_raw_rare, var = "Sample")
#filter out low read sample
div.rare.bl <- left_join(tvl.rownames, div.rare.bl) %>% column_to_rownames(var = "Sample")


# Generate the rarefaction data, combine with metadata, transform for plotting, and aggregate
rarefy_agg <- rarefy(div.rare.bl, sample = 1000) %>% # subsample 
  as.data.frame() %>% 
  rownames_to_column("Sample") 

colnames(rarefy_agg) <- c("Sample", "Unique_Otus_per_1k_reads")

rarefy_agg <- left_join(rarefy_agg, dplyr::select(otu_df, Sample, Organ, Sample_Type2), by = "Sample")

rarefy_agg <- dplyr::filter(rarefy_agg, !is.na(Organ))

rarefy_agg_summary  <- rarefy_agg %>% 
                       group_by(Sample_Type2) %>%
                       summarize(Mean_Species = mean(Unique_Otus_per_1k_reads), 
                                 SEM = sqrt(var(Unique_Otus_per_1k_reads)/length(Unique_Otus_per_1k_reads)))

rarefy_agg$Sample_Type2 <- factor(rarefy_agg$Sample_Type2, 
                              levels = c("Empty", "Water",  "AE", "Iso_Ctrl","Syringe_Rinse", "PBS", "Homog_Ctrl","BAL", "Lung",  "Tongue", "Cecum"))

rarefy_agg_summary$Sample_Type2 <- factor(rarefy_agg_summary$Sample_Type2, 
                              levels = c("Empty", "Water",  "AE", "Iso_Ctrl","Syringe_Rinse", "PBS", "Homog_Ctrl","BAL", "Lung",  "Tongue", "Cecum"))
rich_pal <- c( "#EB3D0E", #scarlet - lung
               "#0300A6", # blue - BAL
              "#FB9986", #vivid tangerine - tongue
              "#0BDBA7", #sea foam green - sampling ctrls
              "#F2C121", #- isolation ctrls 
              "#666B6E" #- sequencing ctrls
        )

ggplot(rarefy_agg_summary, aes(x = Sample_Type2, y = Mean_Species, fill = Sample_Type2)) + # This brings in the aggregated dataframe and assigns columns to various plot elements
  geom_col(color="black", show.legend = FALSE) + # Add bars representing the mean of specimens' diversity
  geom_point(data = rarefy_agg, aes(x = Sample_Type2, y = Unique_Otus_per_1k_reads ), width = .2, show.legend = FALSE) + # Use specimen level data, map columns to scatterplot attributes
  geom_errorbar(aes(ymin = Mean_Species - SEM, ymax = Mean_Species + SEM), width = 0.5, size=0.7) + # Add an errorbar layer
  theme_classic() + # A minimalist theme
  theme(axis.title.y = element_text(face = "bold", size = 12), 
        axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 10),
        axis.title.x = element_text(face = "bold", size = 12), 
        plot.margin=unit(c(0.5,0.5,0.5,1), "cm")) +
  scale_y_continuous(limits = c(0,150), breaks = seq(0,150,25), expand = c(0,0)) + 
  labs(y = "Richness\n(No. of Unique OTUs/1000 reads)\n", x = "\nSample Type") 

otu_df <- mutate(otu_df, Alpha_Div_Groups =  ifelse(test = Organ != "Lung", 
                                                 yes = as.character(Organ), 
                                                 no = as.character(Sample_Type))) %>% 
        dplyr::select(Sample:Gene_16S_copies_per_mL, Alpha_Div_Groups, everything())

otu_df  <- left_join(otu_df, dplyr::select(rarefy_agg, Sample, Unique_Otus_per_1k_reads)) %>%
            dplyr::select(Sample:Gene_16S_copies_per_mL, Unique_Otus_per_1k_reads, everything())

tukey_otu_df <- filter(otu_df, Sample_Type!="Mock" & Sample_Type!="Cecum" & Sample_Type!="Tongue")

TukeyHSD(aov(tukey_otu_df[,"Unique_Otus_per_1k_reads"] ~ tukey_otu_df[, "Alpha_Div_Groups"]))
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = tukey_otu_df[, "Unique_Otus_per_1k_reads"] ~ tukey_otu_df[, "Alpha_Div_Groups"])

$`tukey_otu_df[, "Alpha_Div_Groups"]`
                                           diff        lwr         upr
Isolation Control-BAL                -10.612291 -27.844847   6.6202640
Lung-BAL                              20.846543   2.847725  38.8453597
Sampling Control-BAL                  -2.528455 -21.619083  16.5621736
Sequencing Control-BAL               -13.515401 -27.901951   0.8711478
Lung-Isolation Control                31.458834  14.226279  48.6913895
Sampling Control-Isolation Control     8.083837 -10.286129  26.4538024
Sequencing Control-Isolation Control  -2.903110 -16.318636  10.5124164
Sampling Control-Lung                -23.374997 -42.465626  -4.2843690
Sequencing Control-Lung              -34.361944 -48.748493 -19.9753948
Sequencing Control-Sampling Control  -10.986946 -26.718045   4.7441525
                                         p adj
Isolation Control-BAL                0.4262449
Lung-BAL                             0.0150895
Sampling Control-BAL                 0.9958863
Sequencing Control-BAL               0.0757056
Lung-Isolation Control               0.0000253
Sampling Control-Isolation Control   0.7329503
Sequencing Control-Isolation Control 0.9737499
Sampling Control-Lung                0.0087438
Sequencing Control-Lung              0.0000000
Sequencing Control-Sampling Control  0.2986417

4.2 Alpha Diversity - Shannon Diversity Index

#calculate Shannon diversity indices for all samples
div.shan <- diversity(otu_good, "shannon") %>% data.frame(Shannon = .) %>% rownames_to_column("Sample") %>% left_join(otu_df[,1:6], by = "Sample")

otu_df  <- mutate(otu_df, Shannon = diversity(otu_good, "shannon")) %>% dplyr::select(Sample:Alpha_Div_Groups, Shannon, everything())

# Create summary statistics
agg_shan_all <- div.shan %>%
  group_by(Sample_Type2) %>%
  summarize(Mean_Sh = mean(Shannon), SEM_Sh = sqrt(var(Shannon)/length(Shannon))) 

agg_shan_all$Sample_Type2 <- factor(agg_shan_all$Sample_Type2, 
                              levels = c("Mock", "Empty", "Water",  "AE", "Iso_Ctrl","Syringe_Rinse", "PBS", "Homog_Ctrl","BAL", "Lung",  "Tongue", "Cecum"))
                              
agg_shan_all <- filter(agg_shan_all, Sample_Type2 != "Mock")


div.shan$Sample_Type2 <- factor(div.shan$Sample_Type2, levels = c("Mock", "Empty", "Water",  "AE", "Iso_Ctrl","Syringe_Rinse", "PBS", "Homog_Ctrl","BAL", "Lung",  "Tongue", "Cecum"))

div.shan <- filter(div.shan, Sample_Type2 != "Mock")


ggplot(agg_shan_all, aes(x = Sample_Type2, y = Mean_Sh, fill = Sample_Type2)) + # This brings in the aggregated dataframe and assigns columns to various plot elements
  geom_col(color="black", show.legend = FALSE) + # Add bars representing the mean of specimens' Shannon diversity by vendor
  geom_point(data = div.shan, aes(x = Sample_Type2, y = Shannon), width = .2, show.legend = FALSE) + # Use specimen level data, map columns to scatterplot attributes
  geom_errorbar(aes(ymin = Mean_Sh - SEM_Sh, ymax = Mean_Sh + SEM_Sh), width = 0.5, size=0.7) + # Add an errorbar layer
  theme_classic() + # A minimalist theme
  theme(axis.title.y = element_text(face = "bold", size = 15), 
        axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 10),
        axis.title.x = element_text(face = "bold", size = 15), 
        plot.margin=unit(c(0.5,0.5,0.5,1), "cm")) + 
  scale_y_continuous(limits = c(-0.01,4.5), breaks = seq(0,4,1), expand = c(0,0)) + 
  labs(y = "Shannon Diversity Index\n", x = "\nSample Type") 

tukey_otu_df <- dplyr::filter(otu_df, Sample_Type != "Mock" & Sample_Type != "Cecum" & Sample_Type != "Tongue")
TukeyHSD(aov(tukey_otu_df[,"Shannon"] ~ tukey_otu_df[, "Alpha_Div_Groups"]))
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = tukey_otu_df[, "Shannon"] ~ tukey_otu_df[, "Alpha_Div_Groups"])

$`tukey_otu_df[, "Alpha_Div_Groups"]`
                                           diff        lwr        upr     p adj
Isolation Control-BAL                -1.0172362 -1.8642930 -0.1701794 0.0106434
Lung-BAL                              0.5738854 -0.3108366  1.4586074 0.3728207
Sampling Control-BAL                 -0.5979371 -1.5363265  0.3404522 0.3911169
Sequencing Control-BAL               -1.2413229 -1.9484858 -0.5341600 0.0000534
Lung-Isolation Control                1.5911216  0.7440648  2.4381785 0.0000143
Sampling Control-Isolation Control    0.4192991 -0.4836665  1.3222647 0.6921918
Sequencing Control-Isolation Control -0.2240867 -0.8835195  0.4353462 0.8756718
Sampling Control-Lung                -1.1718226 -2.1102119 -0.2334332 0.0071118
Sequencing Control-Lung              -1.8152083 -2.5223712 -1.1080454 0.0000000
Sequencing Control-Sampling Control  -0.6433858 -1.4166393  0.1298678 0.1478642

4.3 Beta Diversity - Bray-Curtis dissimilarity Index within Sample Type

bray_df_lbtce <- dplyr::filter(otu_df, Sample_Type2 == "Lung" | Sample_Type2 == "Tongue" | Sample_Type2 == "Cecum" | Sample_Type2 == "BAL" | Sample_Type2 == "Empty") %>%
  droplevels() %>% 
  column_to_rownames("Sample")  %>% # This turns the specimen column into the rownames.
  dplyr::select(-c(Sample_Type:Alpha_Div_Groups, Shannon))

#identify all unique comparisons within sample types 
sample_df <- rownames(bray_df_lbtce) %>% as.data.frame()
colnames(sample_df) <- c("Sample")
sample_df <- mutate(sample_df, Sample_Type = factor(str_extract(Sample, pattern="Lung|BAL|Tongue|Cecum|Blank")))
dim(sample_df) #[1] 87  2
comb_all <- combinations(n = 87, r = 2, v = sample_df$Sample) %>% as.data.frame()
colnames(comb_all) <- c("Sample", "Comparison")
comb_all_mut <- mutate(comb_all, Sample_Type = factor(str_extract(Sample, pattern="Lung|BAL|Tongue|Cecum|Blank")),
                                    Comparison_Type = factor(str_extract(Comparison, pattern="Lung|BAL|Tongue|Cecum|Blank")))
comb_uniq <- dplyr::filter(comb_all_mut, (Sample_Type == Comparison_Type))

#calculate distance, convert to symmetric matrix, and calculate row means (one mean per sample)
bray_dist_lbtce <- vegdist(bray_df_lbtce, method = "bray")
bray_dist_lbtce <- as.matrix(bray_dist_lbtce) %>% as.data.frame() %>% rownames_to_column(var="Sample")
bray_dist_lbtce <- dplyr::select(bray_dist_lbtce, Sample, everything())

bray_dist_lbtce_long <- pivot_longer(bray_dist_lbtce, -Sample, names_to = "Comparison", values_to = "BC_Index")
bray_dist_lbtce_long <- dplyr::filter(bray_dist_lbtce_long, Sample != Comparison)

bray_dist_lbtce_long_mut_filt_uniq <- left_join(comb_uniq, bray_dist_lbtce_long)

bray_dist_lbtce_summary <- group_by(bray_dist_lbtce_long_mut_filt_uniq, Sample) %>% 
                           summarize(Mean_BC = mean(BC_Index), SEM = sem(BC_Index)) 
            
bray_dist_lbtce_summary <- ungroup(bray_dist_lbtce_summary) %>% 
                          mutate(Sample_Type = factor(str_extract(Sample,pattern="Lung|BAL|Tongue|Cecum|Blank")))

bray_dist_lbtce_summary$Sample_Type <- factor(bray_dist_lbtce_summary$Sample_Type, 
                                     levels = c("Blank", "BAL","Lung", "Tongue", "Cecum"))

ggplot(bray_dist_lbtce_summary, aes(x=Sample_Type, y = Mean_BC)) + 
  geom_boxplot(aes(fill=Sample_Type), show.legend = FALSE) + 
  geom_point() + 
  scale_y_continuous(limits = c(0,1.01), expand = c(0,0)) + 
  theme_classic() + 
  theme(axis.title.y = element_text(face = "bold", size = 15), 
        axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 10),
        axis.title.x = element_text(face = "bold", size = 15), 
        plot.margin=unit(c(0.5,0.5,0.5,1), "cm")) + 
  labs(y = "Bray-Curtis Dissimilarity\n", x = "\nSample Type") + 
  scale_fill_manual(values=c("#DBE6EC","#0300A6", "#EB3D0E", "#0BDBA7", "#0BDBA7"))

[1] 88  2
pairwise.wilcox.test(bray_dist_lbtce_long_mut_filt_uniq$BC_Index, 
                     bray_dist_lbtce_long_mut_filt_uniq$Sample_Type, 
                     p.adjust.method = "BH")

    Pairwise comparisons using Wilcoxon rank sum test with continuity correction 

data:  bray_dist_lbtce_long_mut_filt_uniq$BC_Index and bray_dist_lbtce_long_mut_filt_uniq$Sample_Type 

       BAL     Blank   Cecum   Lung   
Blank  0.29581 -       -       -      
Cecum  < 2e-16 < 2e-16 -       -      
Lung   3.8e-12 7.1e-07 < 2e-16 -      
Tongue < 2e-16 2.4e-11 < 2e-16 0.00028

P value adjustment method: BH 

4.4 Bray-Curtis dissimilarity - Lung Samples vs. Matched Tongue Samples

bray_df_tvbl <- dplyr::filter(otu_df, Organ == "Lung" | Organ == "Tongue") %>%
  droplevels() %>% 
  column_to_rownames("Sample")  %>% # This turns the specimen column into the rownames.
  dplyr::select(-c(Sample_Type:Shannon))

#calculate distance, convert to symmetric matrix, and calculate row means (one mean per sample)
bray_dist_tvbl <- vegdist(bray_df_tvbl, method = "bray")
bray_dist_tvbl <- as.matrix(bray_dist_tvbl) %>% as.data.frame() %>% rownames_to_column(var="Sample")

bray_dist_tvbl_long <- pivot_longer(bray_dist_tvbl, -Sample, names_to = "Comparison", values_to = "BC_Index")
bray_dist_tvbl_long <- dplyr::filter(bray_dist_tvbl_long, Sample != Comparison)

bray_dist_tvbl_long_mut <- dplyr::mutate(bray_dist_tvbl_long, 
                                    Sample_Mouse = factor(str_extract(Sample, pattern="L\\d+$|B\\d+$")), 
                                    Comparison_Mouse = factor(str_extract(Comparison, pattern="L\\d+$|B\\d+$")),
                                    Sample_Type = factor(str_extract(Sample, pattern="Lung|BAL|Tongue")),
                                    Comparison_Type = factor(str_extract(Comparison, pattern="Lung|BAL|Tongue")))


bray_dist_tvbl_long_mut_filt <- dplyr::filter(bray_dist_tvbl_long_mut, (Sample_Type != Comparison_Type) & (Sample_Mouse == Comparison_Mouse) & Sample_Type == "Tongue")


ggplot(bray_dist_tvbl_long_mut_filt, aes(x = Comparison_Type, y = BC_Index)) +
  geom_boxplot(aes(fill=Comparison_Type), show.legend = FALSE) + 
  geom_point() + 
  scale_y_continuous(limits = c(0,1.1), expand = c(0,0)) + 
  theme_classic() + 
  theme(axis.title.y = element_text(face = "bold", size = 15), 
        axis.text.y = element_text(size = 15),
        axis.text.x = element_text(size = 10),
        axis.title.x = element_text(face = "bold", size = 15), 
        plot.margin=unit(c(0.5,0.5,0.5,1), "cm")) + 
  ylab("Bray-Curtis Dissimilarity\nto Matched Tongue Sample\n") + 
  scale_fill_manual(values=c( "#0300A6", "#EB3D0E"))

#Mann-Whitney U test
wilcox.test(BC_Index ~ Comparison_Type, data = bray_dist_tvbl_long_mut_filt)

    Wilcoxon rank sum exact test

data:  BC_Index by Comparison_Type
W = 95, p-value = 0.0002057
alternative hypothesis: true location shift is not equal to 0

V. Relative Abundance

5.1 Ordered by Lung OTUs

otu_df <- dplyr::mutate(otu_df, RA_Groups = ifelse(test = is.na(RA_Groups), 
                                    yes = as.character(Sample_Type), 
                                    no = as.character(RA_Groups)
                                      )) %>% 
  dplyr::select(Sample:Shannon, RA_Groups, everything()) 

otu_df$RA_Groups <- as.factor(otu_df$RA_Groups)

gg.bal.lung.neg.tong.RAG.ordL <- filter(otu_df, Sample_Type != "Cecum" & Sample_Type != "Mock") %>% 
taxon_sort_gather(facet_var="RA_Groups", ord_val = "Lung")

gg.bal.lung.neg.tong.RAG.ordL$RA_Groups <- factor(gg.bal.lung.neg.tong.RAG.ordL$RA_Groups, levels = c("Lung","Tongue", "BAL", "Negative Control"), labels = c("Lung","Tongue", "BAL", "Negative\nControls"))

plot_ra(gg.bal.lung.neg.tong.RAG.ordL, fill = "RA_Groups", error_bar = T) + facet_grid(RA_Groups~.) +
  theme_bw() + 
  geom_col(color="black", aes(fill=RA_Groups)) +
   theme(axis.text.x = element_text(size = 20, angle = 45, hjust = 1, vjust=1), 
         axis.text.y = element_text(size = 22),
         axis.title.y = element_text(face = "bold", size = 30), 
         axis.title.x = element_text(), 
         strip.text = element_text(face = "bold", size = 25, angle = 0), 
         strip.background = element_blank(),
         panel.grid = element_blank(),
         legend.position = "none", 
         panel.spacing = unit(2, "lines"),
         plot.margin=unit(c(0.5,0.5,0.5,6), "cm")) + 
  scale_y_continuous(expand = c(0,0), limits = c(0,25), labels = dollar_format(suffix = "%", prefix = "")) + 
  labs(y = "Relative Abundance (%)\n", x = "\nOTU") + 
  scale_fill_manual(values=c("#EB3D0E", "#FB9986",  "#0300A6", "#666B6E"))

5.2 Ordered by Pooled Negative Control OTUs

gg.bal.lung.neg.tong.RAG.ordN <- filter(otu_df, Sample_Type != "Cecum" & Sample_Type != "Mock") %>% 
taxon_sort_gather(facet_var="RA_Groups", ord_val = "Negative Control")

gg.bal.lung.neg.tong.RAG.ordN$RA_Groups <- factor(gg.bal.lung.neg.tong.RAG.ordN$RA_Groups, levels = c("Negative Control", "BAL", "Lung", "Tongue"), labels = c("Negative\nControls", "BAL", "Lung", "Tongue"))


plot_ra(gg.bal.lung.neg.tong.RAG.ordN, fill = "RA_Groups", error_bar = T) + facet_grid(RA_Groups~.) +
  theme_bw() + 
  geom_col(color="black", aes(fill=RA_Groups)) +
   theme(axis.text.x = element_text(size = 20, angle = 45, hjust = 1, vjust=1), 
         axis.text.y = element_text(size = 22),
         axis.title.y = element_text(face = "bold", size = 30), 
         axis.title.x = element_text(), 
         strip.text = element_text(face = "bold", size = 25, angle = 0), 
         strip.background = element_blank(),
         panel.grid = element_blank(),
         legend.position = "none", 
         panel.spacing = unit(2, "lines"),
         plot.margin=unit(c(0.5,0.5,0.5,6), "cm")) + 
  scale_y_continuous(expand = c(0,0), limits = c(0,25), labels = dollar_format(suffix = "%", prefix = "")) + 
  labs(y = "Relative Abundance (%)\n", x = "\nOTU") + 
  scale_fill_manual(values=c("#666B6E",  "#0300A6", "#EB3D0E", "#FB9986"))

5.3 Ordered by Tongue OTUs

gg.bal.lung.neg.tong.RAG.ordT <- filter(otu_df, Sample_Type != "Cecum" & Sample_Type != "Mock") %>% 
taxon_sort_gather(facet_var="RA_Groups", ord_val = "Tongue")

gg.bal.lung.neg.tong.RAG.ordT$RA_Groups <- factor(gg.bal.lung.neg.tong.RAG.ordT$RA_Groups, levels = c("Tongue",  "Lung","BAL", "Negative Control"), labels = c("Tongue", "Lung","BAL", "Negative\nControls"))

plot_ra(gg.bal.lung.neg.tong.RAG.ordT, fill = "RA_Groups", error_bar = T) + facet_grid(RA_Groups~.) +
  theme_bw() + 
  geom_col(color="black", aes(fill=RA_Groups)) +
   theme(axis.text.x = element_text(size = 20, angle = 45, hjust = 1, vjust=1), 
         axis.text.y = element_text(size = 22),
         axis.title.y = element_text(face = "bold", size = 30), 
         axis.title.x = element_text(), 
         strip.text = element_text(face = "bold", size = 25, angle = 0), 
         strip.background = element_blank(),
         panel.grid = element_blank(),
         legend.position = "none", 
         panel.spacing = unit(2, "lines"),
         plot.margin=unit(c(0.5,0.5,0.5,6), "cm")) + 
  scale_y_continuous(expand = c(0,0), limits = c(0,25), labels = dollar_format(suffix = "%", prefix = "")) + 
  labs(y = "Relative Abundance (%)\n", x = "\nOTU") + 
  scale_fill_manual(values=c("#FB9986", "#EB3D0E", "#0300A6", "#666B6E"))